This report contains code written by Hannah, including the code for the presentation and the report, as well as her own exploratory model-selection work.

library(tidyverse)
library(visdat)
library(dplyr)
theme_set(theme_bw())

Data Description

Our data is the Student Performance data set from the UC Irvine Machine Learning Repository. We have two data sets; both have the same 30 features but different entries. The data come from two Portuguese secondary schools and were collected via school reports and questionnaires.

Data source URL: https://archive.ics.uci.edu/dataset/320/student+performance

Each data set has G3 column which is the final grade we want to predict. One data set is Math final grade related and the other is Portuguese final grade related. There are no missing values in both data sets and there is strong correlation between G3, G1, and G2 because G1 and G2 are grades from period 1 and 2.

There are more rows in Portuguese data set than Math data set. Math Data is 395 x 33 dimension and Portuguese Data is 649 x 33.

# Load data
og.d1=read.table("student/student-mat.csv",sep=";",header=TRUE) # d1 is Math
og.d2=read.table("student/student-por.csv",sep=";",header=TRUE) # d2 is Portuguese

# BUG FIX: the original code filtered og.d2 twice in a row, and the second
# filter (G3 != 1) started again from og.d2, silently discarding the G3 != 0
# filter. The zeros that were kept made log(G3) = -Inf below, which is why
# the Portuguese correlation table later prints all NaN and the log plots
# warn about non-finite values.
d1 <- og.d1 %>% filter(G3 != 0)           # drop rows where the final grade is 0
d2 <- og.d2 %>% filter(G3 != 0, G3 != 1)  # drop rows where the final grade is 0 or 1

# Log-transformed copies of the response, used for the log-scale EDA
log.d1 <- d1 
log.d1$G3 <- log(d1$G3)
log.d2 <- d2 
log.d2$G3 <- log(d2$G3)

Code for EDA for the report

# Summary table of the data set's variables, grouped by measurement type.
variable_groups <- c(
  "sex, schoolsup, famsup, paid, activities, nursery, higher, internet, romantic",
  "school, address, famsize, Pstatus, Mjob, Fjob, reason, guardian",
  "Medu, Fedu, traveltime, studytime, failures",
  "famrel, freetime, goout, Dalc, Walc, health",
  "age, absences, G1, G2, G3"
)
group_types <- c("Binary", "Nominal Category", "Ordinal Category", "Likert Scale", "Discrete Numerical")
group_classes <- c("Boolean", "Character", "Integer", "Integer", "Integer")

dataset_table <- data.frame(
  Variable_Names = variable_groups,
  Type = group_types,
  Class = group_classes
)

library(dplyr)
library(ggplot2)
library(patchwork)

# Load data (raw, unfiltered) for the G3 distribution comparison figure
og.d1 = read.table("student/student-mat.csv", sep = ";", header = TRUE) # d1 is Math
og.d2 = read.table("student/student-por.csv", sep = ";", header = TRUE) # d2 is Portuguese

# Tag each data set with its subject, keep only the tag and the final grade
my.og.d1 <- og.d1 %>% mutate(Subject = "Math") %>% select(Subject, G3)
my.og.d2 <- og.d2 %>% mutate(Subject = "Portuguese") %>% select(Subject, G3)

# Stack the two subjects into one long frame for faceted plotting
combined <- bind_rows(my.og.d1, my.og.d2)

# Box plot of G3, one facet per subject
boxplot <- ggplot(combined, aes(x = G3)) +
  geom_boxplot() +
  facet_grid(. ~ Subject) +
  labs(x = "G3")

# Bar chart of the G3 counts, one facet per subject
barplot <- ggplot(combined) +
  geom_bar(aes(x = G3), fill = "skyblue") +
  facet_grid(. ~ Subject) +
  labs(y = "Students", x = NULL, title = "Distribution of G3 (final grade)")

# patchwork: bar chart stacked on top of the box plot
barplot / boxplot

We have decided to filter the data sets to only look at rows where G3 is not 0. G3 is an integer between 0 and 20, and a score of 0 more plausibly indicates that the student was absent from the exam than that the student attempted it and scored 0.

Check Missing Data (This graph is not important)
# Missing-data overview: vis_miss plots every cell, so an all-present panel
# confirms there are no NAs. coord_flip puts the variables on the y-axis;
# the legend is dropped to save space.
visdat::vis_miss(d1) + coord_flip() + theme(legend.position = "none")

visdat::vis_miss(d2) + coord_flip() + theme(legend.position = "none")

Both data sets show that there are no missing values.

Check Variable Distribution For Numerical Data

Math Data Set

# One free-scaled histogram per numeric column of the filtered Math data
d1 |>
  pivot_longer(cols = where(is.numeric)) |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black", alpha = 0.7) +
  facet_wrap(vars(name), scales = "free") +
  theme_minimal() +
  labs(x = "Numerical Features", y = "Count",
       title = "Math Data Distribution of Each Numerical Feature")

Other than absences, Dalc, failures, health, Medu, traveltime, Walc variables, the rest of the variables are normally distributed.

Portuguese Dataset

# One free-scaled histogram per numeric column of the filtered Portuguese data
d2 |>
  pivot_longer(cols = where(is.numeric)) |>
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "black", alpha = 0.7) +
  facet_wrap(vars(name), scales = "free") +
  theme_minimal() +
  labs(x = "Numerical Features", y = "Count",
       title = "Portuguese Data Distribution of Each Numerical Feature")

Other than absences, Dalc, failures, health, Medu, traveltime, Walc variables, the rest of the variables are normally distributed.

Both Math and Portuguese Data sets are similarly distributed.

Check Variable Relationship With G3 and transformed datasets

log(Math) numerical

# Jittered scatter of each selected numeric predictor vs log(G3), Math data,
# with a per-facet least-squares line.
log.d1 |>
  select(G2, famrel, absences, Medu, goout, G1, failures, age, Dalc, traveltime, G3) |>
  pivot_longer(-G3) |>
  ggplot(aes(x = value, y = G3)) +
  geom_jitter(width = 0.1, height = 1, alpha = 0.2) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  facet_wrap(vars(name), scales = "free") +
  labs(x = "Independent Features", y = "Log G3",
       title = "Math Data Relationship with Each Numerical Feature")
## `geom_smooth()` using formula = 'y ~ x'

For Math data set, we can see strong correlation between G1 and G3 and G2 and G3. There is also slight correlation between failures, age, Dalc, goout, Medu vs G3.

log(Portuguese) numerical

# Jittered scatter of each selected numeric predictor vs log(G3), Portuguese
# data, with a per-facet least-squares line.
log.d2 |>
  select(G2, famrel, absences, Medu, goout, G1, failures, age, Dalc, traveltime, G3) |>
  pivot_longer(-G3) |>
  ggplot(aes(x = value, y = G3)) +
  geom_jitter(width = 0.1, height = 1, alpha = 0.2) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  facet_wrap(vars(name), scales = "free") +
  labs(x = "Independent Features", y = "Log G3",
       title = "Portuguese Data Relationship with Each Numerical Feature")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 150 rows containing non-finite outside the scale range
## (`stat_smooth()`).

log(Math) categorical

# Box plots of log(G3) by the two highlighted categorical predictors (Math)
log.d1 |>
  select(nursery, Fjob, G3) |>
  pivot_longer(-G3) |>
  ggplot(aes(x = value, y = G3, fill = value)) +
  geom_boxplot(color = "black", alpha = 0.7) +
  facet_wrap(vars(name), scales = "free") +
  labs(x = "Independent Features", y = "Log G3",
       title = "Math Data Relationship with Each Categorical Feature") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 0.5, hjust = 1)) +
  guides(fill = "none")

log(Portuguese) categorical

# Box plots of log(G3) by the two highlighted categorical predictors (Portuguese)
log.d2 |>
  select(nursery, Fjob, G3) |>
  pivot_longer(-G3) |>
  ggplot(aes(x = value, y = G3, fill = value)) +
  geom_boxplot(color = "black", alpha = 0.7) +
  facet_wrap(vars(name), scales = "free") +
  labs(x = "Independent Features", y = "Log G3",
       title = "Portuguese Data Relationship with Each Categorical Feature") +
  theme_minimal() +
  theme(axis.text.x = element_text(vjust = 0.5, hjust = 1)) +
  guides(fill = "none")
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

The code under this is not included in the Powerpoint

Focusing on log(math$G3) relationship with ‘address’, ‘Medu’, ‘Mjob’, ‘traveltime’, ‘nursery’.

Also removing ‘paid’, ‘famrel’, ‘goout’, ‘health’, and ‘absences’

# Columns kept for the focused Math EDA. Medu and traveltime were listed
# twice in the original select(); the duplicates are removed here (dplyr
# keeps a single copy either way, so the result is unchanged).
selected.log.d1 <- log.d1 %>% 
  select(address, Medu, Mjob, traveltime, nursery, age, Dalc, failures,
         Fedu, freetime, studytime, Walc, G3)

# Cast every predictor to character so mixed types can share one long frame
long.d1 <- selected.log.d1 %>%
  mutate(across(-G3, as.character)) %>%
  pivot_longer(cols = -G3)

# Jittered scatter of log(G3) against each (discretized) predictor level
long.d1 %>%
  ggplot(aes(x=value,y=G3))  + geom_jitter(width=0.1,height=1,alpha=0.2)+
  facet_wrap(~name,scales='free') + 
  labs(x = "Independent Features", y = "Log G3", title = "Math Data Relationship with Each Categorical Feature")+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 

# Column chart: geom_col stacks the log(G3) values within each level, so bar
# height reflects both group size and grade level
long.d1 %>%
  ggplot(aes(x = value, y = G3)) +
  geom_col(fill = "skyblue", color = "black", alpha = 0.7) +
  facet_wrap(~name, scales = 'free') +
  labs(x = "Independent Features (value)", y = "Log G3", title = "Relationship Between Log G3 and Categorical Features") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))

# Box plots of the RAW final grade (G3) by every categorical column of d1.
# Two fixes: the original piped through a no-op mutate(across(-G3)) with no
# function supplied, and its axis/title labels said "Log G3" even though the
# data here is the unlogged d1.
selected.d1.categorical <- d1 %>% 
  select_if(is.character)  %>%
  bind_cols(d1 %>% select(G3))

# Pivot directly; the original's mutate(across(-G3)) did nothing
selected.d1.categorical.long <- selected.d1.categorical %>%
  pivot_longer(cols = -G3)

selected.d1.categorical.long %>%
  ggplot(aes(x = value, y = G3)) +
  geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
  facet_wrap(~name, scales = 'free') +
  labs(x = "Independent Features (value)", y = "G3", title = "Relationship Between G3 and Categorical Features") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))

# Every numeric predictor vs log(G3) for the Portuguese data
log.d2 |>
  select(where(is.numeric)) |>
  pivot_longer(-G3) |>
  ggplot(aes(x = value, y = G3)) +
  geom_jitter(width = 0.1, height = 1, alpha = 0.2) +
  facet_wrap(vars(name), scales = "free") +
  labs(x = "Independent Features", y = "Log G3",
       title = "Portuguese Data Relationship with Each Numerical Feature")

For the Portuguese data set, we can see a strong correlation between G1 and G3 and between G2 and G3. There are also slight correlations of failures, absences, famrel, age, Dalc, and Medu with G3.

Focusing on log(math$G3) relationship without ‘internet’ variable.
# Same focused selection for the Portuguese data. As with the Math version,
# the duplicated Medu and traveltime entries in the original select() are
# removed (dplyr keeps one copy regardless, so the result is unchanged).
selected.log.d2 <- log.d2 %>% 
  select(address, Medu, Mjob, traveltime, nursery, age, Dalc, failures,
         Fedu, freetime, studytime, Walc, G3)

# Cast predictors to character and pivot to long format for faceting
long.d2 <- selected.log.d2 %>%
  mutate(across(-G3, as.character)) %>%
  pivot_longer(cols = -G3)

# Jittered scatter of log(G3) against each predictor level
long.d2 %>%
  ggplot(aes(x=value,y=G3))  + geom_jitter(width=0.1,height=1,alpha=0.2)+
  facet_wrap(~name,scales='free') + 
  labs(x = "Independent Features", y = "Log G3", title = "Portuguese Data Relationship with Each Categorical Feature")+
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1)) 

# Column chart: bar heights stack log(G3) values within each level
long.d2 %>%
  ggplot(aes(x = value, y = G3)) +
  geom_col(fill = "skyblue", color = "black", alpha = 0.7) +
  facet_wrap(~name, scales = 'free') +
  labs(x = "Independent Features (value)", y = "Log G3", title = "Relationship Between Log G3 and Categorical Features") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
## Warning: Removed 180 rows containing missing values or values outside the scale range
## (`geom_col()`).

# Box plots of log(G3) per predictor level, Portuguese data
long.d2 |>
  ggplot(aes(value, G3)) +
  geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
  facet_wrap(vars(name), scales = "free") +
  labs(x = "Independent Features (value)", y = "Log G3",
       title = "Relationship Between Log G3 and Categorical Features") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1))
## Warning: Removed 180 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Pearson correlation test for log(dataset$G3)

Assumption: Continuous data, normally distributed, linear relationship between two, no outliers.

# Pearson correlation of every numeric predictor with log(G3), Math data.
# Rewritten without the grow-by-rbind loop, and with a consistent
# "correlation" column name — the original declared "Correlation" in the
# empty frame but rbind-ed rows named "correlation".
log.d1 <- log.d1 %>% select(where(is.numeric))  # keep numeric columns only

# All candidate predictors (everything except the target)
predictors.d1 <- setdiff(names(log.d1), "G3")

# One row per predictor: correlation estimate and its p-value
log.d1.results <- do.call(rbind, lapply(predictors.d1, function(col) {
  test_result <- cor.test(log.d1[[col]], log.d1$G3)  # Pearson by default
  data.frame(Independent.Variable = col,
             correlation = test_result$estimate,
             p_value = test_result$p.value)
}))

# Strongest positive correlation first
sorted.log.d1.results <- log.d1.results[order(-log.d1.results$correlation), ]

sorted.log.d1.results
##       Independent.Variable correlation       p_value
## cor14                   G2  0.94381602 8.790074e-173
## cor13                   G1  0.86944043 9.370251e-111
## cor1                  Medu  0.16040880  2.366456e-03
## cor2                  Fedu  0.14882218  4.836019e-03
## cor4             studytime  0.12900290  1.472432e-02
## cor6                famrel  0.03632547  4.938688e-01
## cor7              freetime -0.03793524  4.749127e-01
## cor11               health -0.06699416  2.066621e-01
## cor3            traveltime -0.08237070  1.202962e-01
## cor                    age -0.12875830  1.491543e-02
## cor9                  Dalc -0.12998506  1.397840e-02
## cor10                 Walc -0.17672493  7.969724e-04
## cor8                 goout -0.19477093  2.133144e-04
## cor12             absences -0.22105304  2.505826e-05
## cor5              failures -0.30463099  4.205888e-09
# Same Pearson correlation table for the Portuguese data, rewritten without
# the grow-by-rbind loop and with a consistent "correlation" column name.
# NOTE(review): the all-NaN output below is caused by -Inf values from
# log(0) rows left in log.d2 by the d2 filtering bug near the top of the
# file — fix that filter and rerun to get real numbers.
log.d2 <- log.d2 %>% select(where(is.numeric))

predictors.d2 <- setdiff(names(log.d2), "G3")

log.d2.results <- do.call(rbind, lapply(predictors.d2, function(col) {
  test_result <- cor.test(log.d2[[col]], log.d2$G3)  # Pearson by default
  data.frame(Independent.Variable = col,
             correlation = test_result$estimate,
             p_value = test_result$p.value)
}))

sorted.log.d2.results <- log.d2.results[order(-log.d2.results$correlation), ]

sorted.log.d2.results
##       Independent.Variable correlation p_value
## cor                    age         NaN     NaN
## cor1                  Medu         NaN     NaN
## cor2                  Fedu         NaN     NaN
## cor3            traveltime         NaN     NaN
## cor4             studytime         NaN     NaN
## cor5              failures         NaN     NaN
## cor6                famrel         NaN     NaN
## cor7              freetime         NaN     NaN
## cor8                 goout         NaN     NaN
## cor9                  Dalc         NaN     NaN
## cor10                 Walc         NaN     NaN
## cor11               health         NaN     NaN
## cor12             absences         NaN     NaN
## cor13                   G1         NaN     NaN
## cor14                   G2         NaN     NaN
# Histogram + kernel density of the filtered Math G3, with a dashed mean
# line. `size` for lines is deprecated since ggplot2 3.4.0 (the warning in
# the original output) — replaced with `linewidth`.
p <- ggplot(d1, aes(x = G3)) +
  geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
  geom_density(alpha = .2, fill = "#FF6666")
# Add mean line
p + geom_vline(aes(xintercept = mean(G3)),
               color = "blue", linetype = "dashed", linewidth = 1) +
  ggtitle(label = "Math Data Set")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same plot for the UNFILTERED Math data (og.d1), for comparison with the
# filtered version above. Deprecated `size` replaced with `linewidth`.
p <- ggplot(og.d1, aes(x = G3)) +
  geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
  geom_density(alpha = .2, fill = "#FF6666")
# Add mean line
p + geom_vline(aes(xintercept = mean(G3)),
               color = "blue", linetype = "dashed", linewidth = 1) +
  ggtitle(label = "Math Data Set")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Portuguese version of the filtered-G3 density plot. Fixes the
# "Porturgese" typo in the title and the deprecated `size` argument.
p <- ggplot(d2, aes(x = G3)) +
  geom_histogram(aes(y = after_stat(density)), colour = "black", fill = "white") +
  geom_density(alpha = .2, fill = "#FF6666")
# Add mean line
p + geom_vline(aes(xintercept = mean(G3)),
               color = "blue", linetype = "dashed", linewidth = 1) +
  ggtitle(label = "Portuguese Data Set")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

END DATA DESCRIPTION ^^^

Models For Math d1

Multiple Stepwise Regression Model

# Stepwise selection (both directions) between the intercept-only model and
# the full model, on the filtered Math data.
null.model <- lm(G3 ~ 1, data = d1)
full.model <- lm(G3 ~ ., data = d1)

stepwise.model <- step(
  null.model,
  scope = list(lower = null.model, upper = full.model),
  direction = "both",
  trace = FALSE
)
stepwise.model
## 
## Call:
## lm(formula = G3 ~ G2 + G1 + famrel + goout + health + absences + 
##     paid, data = d1)
## 
## Coefficients:
## (Intercept)           G2           G1       famrel        goout       health  
##     0.32012      0.87580      0.11082      0.15768     -0.08193     -0.06656  
##    absences      paidyes  
##    -0.01054     -0.12377
# Full coefficient table, R^2 and overall F-test for the stepwise model
summary(stepwise.model) 
## 
## Call:
## lm(formula = G3 ~ G2 + G1 + famrel + goout + health + absences + 
##     paid, data = d1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2967 -0.4035 -0.1005  0.5770  2.5826 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.320121   0.317273   1.009 0.313685    
## G2           0.875800   0.032217  27.185  < 2e-16 ***
## G1           0.110822   0.030908   3.586 0.000384 ***
## famrel       0.157679   0.048662   3.240 0.001309 ** 
## goout       -0.081935   0.039709  -2.063 0.039814 *  
## health      -0.066562   0.030759  -2.164 0.031142 *  
## absences    -0.010540   0.005396  -1.953 0.051589 .  
## paidyes     -0.123767   0.085546  -1.447 0.148854    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8057 on 349 degrees of freedom
## Multiple R-squared:  0.9389, Adjusted R-squared:  0.9377 
## F-statistic: 766.4 on 7 and 349 DF,  p-value: < 2.2e-16

Backward Variable Selection Model

# Backward elimination by hand: starting from the full model, repeatedly
# drop a single term based on the drop1() F-test table. Intermediate
# drop1() calls are left commented out to keep the rendered report short;
# the full removal order is listed in the prose after the code.

# FULL MODEL START
M1 = lm(G3 ~ ., data = d1)
# drop1(M1, test = "F")

# Remove internet since it has the largest F-statistic p-value
# (NOTE(review): the original comment said "Mjob", but the term dropped
# on the next line is internet)
M2 = update(M1, . ~ . -internet)
# drop1(M2, test = "F")

M3 = update(M2, . ~ . -Walc)
# drop1(M3, test = "F")

M4 = update(M3, . ~ . -higher)
# drop1(M4, test = "F")

M5 = update(M4, . ~ . -romantic)
# drop1(M5, test = "F")

M6 = update(M5, . ~ . -activities)
# drop1(M6, test = "F")

M7 = update(M6, . ~ . -freetime)
# drop1(M7, test = "F")

M8 = update(M7, . ~ . -Fjob)
# drop1(M8, test = "F")

M9 = update(M8, . ~ . -sex)
# drop1(M9, test = "F")

M10 = update(M9, . ~ . -failures)
# drop1(M10, test = "F")

M11 = update(M10, . ~ . -Dalc)
# drop1(M11, test = "F")

M12 = update(M11, . ~ . -Fedu)
# drop1(M12, test = "F")

M13 = update(M12, . ~ . -studytime)
# drop1(M13, test = "F")

M14 = update(M13, . ~ . -reason)
# drop1(M14, test = "F")

M15 = update(M14, . ~ . -traveltime)
# drop1(M15, test = "F")

M16 = update(M15, . ~ . -school)
# drop1(M16, test = "F")

M17 = update(M16, . ~ . -Medu)
# drop1(M17, test = "F")

M18 = update(M17, . ~ . -famsize)
# drop1(M18, test = "F")

M19 = update(M18, . ~ . -age)
# drop1(M19, test = "F")

M20 = update(M19, . ~ . -famsup)
# drop1(M20, test = "F")

M21 = update(M20, . ~ . -schoolsup)
# drop1(M21, test = "F")

M22 = update(M21, . ~ . -guardian)
# drop1(M22, test = "F")

M23 = update(M22, . ~ . -address)
# From here on the drop1() tables are shown in full in the output
drop1(M23, test = "F")
## Single term deletions
## 
## Model:
## G3 ~ Pstatus + Mjob + paid + nursery + famrel + goout + health + 
##     absences + G1 + G2
##          Df Sum of Sq    RSS     AIC  F value    Pr(>F)    
## <none>                220.16 -144.56                       
## Pstatus   1      0.82 220.99 -145.23   1.2829  0.258152    
## Mjob      4      4.65 224.82 -145.09   1.8130  0.125823    
## paid      1      1.81 221.98 -143.63   2.8215  0.093922 .  
## nursery   1      1.67 221.84 -143.85   2.6083  0.107224    
## famrel    1      7.61 227.77 -134.43  11.8557  0.000646 ***
## goout     1      3.13 223.29 -141.53   4.8686  0.028011 *  
## health    1      3.63 223.79 -140.72   5.6547  0.017956 *  
## absences  1      2.45 222.61 -142.61   3.8150  0.051607 .  
## G1        1      7.02 227.19 -135.35  10.9408  0.001040 ** 
## G2        1    461.71 681.87  257.02 719.3078 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pstatus has the largest p-value (0.258) in the table above — drop it
M24 = update(M23, . ~ . -Pstatus)
drop1(M24, test = "F")
## Single term deletions
## 
## Model:
## G3 ~ Mjob + paid + nursery + famrel + goout + health + absences + 
##     G1 + G2
##          Df Sum of Sq    RSS     AIC  F value    Pr(>F)    
## <none>                220.99 -145.23                       
## Mjob      4      4.59 225.58 -145.89   1.7856 0.1312168    
## paid      1      1.97 222.96 -144.06   3.0631 0.0809789 .  
## nursery   1      1.45 222.44 -144.89   2.2566 0.1339592    
## famrel    1      7.52 228.50 -135.29  11.7003 0.0007001 ***
## goout     1      3.08 224.07 -142.29   4.7906 0.0292876 *  
## health    1      3.77 224.76 -141.18   5.8735 0.0158857 *  
## absences  1      2.13 223.12 -143.79   3.3232 0.0691771 .  
## G1        1      6.83 227.82 -136.36  10.6334 0.0012217 ** 
## G2        1    464.93 685.92  257.13 723.7250 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# nursery has the largest p-value (0.134) in the table above — drop it
M25 = update(M24, . ~ . -nursery)
drop1(M25, test = "F")
## Single term deletions
## 
## Model:
## G3 ~ Mjob + paid + famrel + goout + health + absences + G1 + 
##     G2
##          Df Sum of Sq    RSS     AIC  F value    Pr(>F)    
## <none>                222.44 -144.89                       
## Mjob      4      4.12 226.56 -146.33   1.5991 0.1740882    
## paid      1      2.25 224.69 -143.30   3.4896 0.0626017 .  
## famrel    1      7.34 229.77 -135.31  11.3787 0.0008273 ***
## goout     1      3.24 225.68 -141.72   5.0296 0.0255517 *  
## health    1      3.59 226.03 -141.18   5.5644 0.0188865 *  
## absences  1      2.20 224.63 -143.38   3.4057 0.0658289 .  
## G1        1      6.79 229.22 -136.16  10.5240 0.0012936 ** 
## G2        1    464.62 687.06  255.72 720.6264 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mjob has the largest p-value (0.174) in the table above — drop it.
# This model (M26) turns out to have the lowest AIC of the sequence.
M26 = update(M25, . ~ . -Mjob)
drop1(M26, test = "F")
## Single term deletions
## 
## Model:
## G3 ~ paid + famrel + goout + health + absences + G1 + G2
##          Df Sum of Sq    RSS     AIC  F value    Pr(>F)    
## <none>                226.56 -146.33                       
## paid      1      1.36 227.92 -146.20   2.0932 0.1488539    
## famrel    1      6.82 233.38 -137.75  10.4994 0.0013090 ** 
## goout     1      2.76 229.33 -144.00   4.2576 0.0398145 *  
## health    1      3.04 229.60 -143.57   4.6829 0.0311416 *  
## absences  1      2.48 229.04 -144.45   3.8152 0.0515890 .  
## G1        1      8.35 234.91 -135.42  12.8565 0.0003842 ***
## G2        1    479.74 706.31  257.59 739.0055 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Next removal: goout. NOTE(review): paid has a larger p-value (0.149) than
# goout (0.040) in the table above — confirm this removal order was intended.
M27 = update(M26, . ~ . -goout)
drop1(M27, test = "F")
## Single term deletions
## 
## Model:
## G3 ~ paid + famrel + health + absences + G1 + G2
##          Df Sum of Sq    RSS     AIC  F value    Pr(>F)    
## <none>                229.33 -144.00                       
## paid      1      1.48 230.80 -143.71   2.2556  0.134034    
## famrel    1      6.52 235.84 -136.00   9.9460  0.001751 ** 
## health    1      2.92 232.24 -141.49   4.4528  0.035554 *  
## absences  1      2.65 231.98 -141.90   4.0504  0.044927 *  
## G1        1      8.65 237.97 -132.79  13.1992  0.000322 ***
## G2        1    483.07 712.40  258.65 737.2708 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Next removal: absences (p = 0.045 above; paid's p-value is larger —
# NOTE(review): confirm the intended removal order)
M28 = update(M27, . ~ . -absences)
drop1(M28, test = "F")
## Single term deletions
## 
## Model:
## G3 ~ paid + famrel + health + G1 + G2
##        Df Sum of Sq    RSS     AIC  F value    Pr(>F)    
## <none>              231.98 -141.90                       
## paid    1      1.35 233.33 -141.82   2.0464 0.1534525    
## famrel  1      7.08 239.06 -133.17  10.7062 0.0011738 ** 
## health  1      2.75 234.73 -139.69   4.1623 0.0420804 *  
## G1      1      7.50 239.48 -132.54  11.3485 0.0008389 ***
## G2      1    522.29 754.27  277.04 790.2612 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Continue elimination past the AIC minimum to show the full removal order
M29 = update(M28, . ~ . -paid)
# drop1(M29, test = "F")

M30 = update(M29, . ~ . -health)
# drop1(M30, test = "F")

M31 = update(M30, . ~ . -G1)
# drop1(M31, test = "F")

M32 = update(M31, . ~ . -famrel)
# drop1(M32, test = "F")

# M33 is the intercept-only model: every term has been removed
M33 = update(M32, . ~ . -G2)
# drop1(M33, test = "F")

Model selection method: The smaller the AIC the better the model; Models that differ by less than one or two AIC values can be regarded as somewhat equally well fitting.

Order of removal : internet, Walc, higher, romantic, activities, freetime, Fjob, sex , failures, Dalc, Fedu, studytime, reason, traveltime, school, Medu, famsize, age, famsup, schoolsup, guardian, address, Pstatus, nursery, Mjob, goout, absences, paid, health, G1, famrel, G2, G3

Model Coefficient Analysis

For backward variable selection, this model has the lowest AIC of -146.33.

Model: G3 ~ paid + famrel + goout + health + absences + G1 + G2 Df Sum of Sq RSS AIC F value Pr(>F)
226.56 -146.33
paid 1 1.36 227.92 -146.20 2.0932 0.1488539
famrel 1 6.82 233.38 -137.75 10.4994 0.0013090 ** goout 1 2.76 229.33 -144.00 4.2576 0.0398145 *
health 1 3.04 229.60 -143.57 4.6829 0.0311416 *
absences 1 2.48 229.04 -144.45 3.8152 0.0515890 .
G1 1 8.35 234.91 -135.42 12.8565 0.0003842 G2 1 479.74 706.31 257.59 739.0055 < 2.2e-16 — Signif. codes: 0 ‘’ 0.001 ‘’ 0.01 ‘’ 0.05 ‘.’ 0.1 ‘ ’ 1

Comparing to step model where the output is this:

Call: lm(formula = G3 ~ G2 + G1 + famrel + goout + health + absences + paid, data = d1)

Coefficients: (Intercept) G2 G1 famrel goout health absences paidyes
0.32012 0.87580 0.11082 0.15768 -0.08193 -0.06656 -0.01054 -0.12377

Both model selection methods select the same variables for the multiple regression used to predict G3.

Model Performance Analysis

# Base-R diagnostic panel for the stepwise model (residuals vs fitted, QQ,
# scale-location, residuals vs leverage).
# NOTE(review): par(mfrow = c(2, 2)) is never reset afterwards — later base
# graphics inherit the 2x2 layout unless the device is reopened.
par(mfrow=c(2,2))
plot(stepwise.model)

# ggplot versions of the first two diagnostic plots
library(ggfortify)
autoplot(stepwise.model,which=1:2)

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggplot2)
library(broom)

# Tidy coefficient table for the stepwise model
coef_df <- tidy(stepwise.model)

# Coefficient Plot: point estimate with an approximate 95% CI (+/- 1.96 SE)
coef_plot <- coef_df %>%
  ggplot(aes(term, estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = estimate - 1.96 * std.error,
                    ymax = estimate + 1.96 * std.error),
                width = 0.2) +
  coord_flip() +
  labs(title = "Coefficient Plot", x = "Predictor", y = "Estimate") +
  theme_minimal()

# Fitted values and residuals for every training row
predicted_values <- augment(stepwise.model)

# Actual vs. Predicted Plot
actual_predicted_plot <- predicted_values %>%
  ggplot(aes(.fitted, G3)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue") +
  labs(title = "Actual vs. Predicted", x = "Predicted G3", y = "Actual G3") +
  theme_minimal()

# Residual Plot
residual_plot <- predicted_values %>%
  ggplot(aes(.fitted, .resid)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residual Plot", x = "Fitted Values", y = "Residuals") +
  theme_minimal()

# Print plots
print(coef_plot)

# Actual vs. predicted scatter for the stepwise model
print(actual_predicted_plot)
## `geom_smooth()` using formula = 'y ~ x'

# Residuals vs. fitted; points should scatter evenly around zero
print(residual_plot)

# Augment the model again and reshape the selected predictors to long
# format so each can be faceted against the actual G3.
predicted_values <- augment(stepwise.model)

melted_data <- melt(
  predicted_values,
  measure.vars = c("G2", "G1", "famrel", "goout", "health", "absences", "paid"),
  variable.name = "IV"
)

# One facet per predictor, with a least-squares trend line
ggplot(melted_data, aes(value, G3)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_jitter(width = 0.1, height = 1, alpha = 0.2) +
  facet_wrap(~IV, scales = "free_x") +
  labs(title = "Effect of Independent Variables on G3",
       x = "Independent Variables (IV)",
       y = "Actual G3") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Echo the selected model once more before ranking its coefficients
stepwise.model
## 
## Call:
## lm(formula = G3 ~ G2 + G1 + famrel + goout + health + absences + 
##     paid, data = d1)
## 
## Coefficients:
## (Intercept)           G2           G1       famrel        goout       health  
##     0.32012      0.87580      0.11082      0.15768     -0.08193     -0.06656  
##    absences      paidyes  
##    -0.01054     -0.12377
# Rank the predictors by the absolute size of their coefficients
coefficients_df <- tidy(stepwise.model) %>%
  mutate(abs_estimate = abs(estimate)) %>%
  arrange(desc(abs_estimate)) %>%        # order by magnitude
  filter(term != "(Intercept)")          # the intercept is not a predictor

# Horizontal bar chart of |coefficient| per predictor
ggplot(coefficients_df, aes(x = reorder(term, abs_estimate), y = abs_estimate)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +  # Flip coordinates for better readability
  labs(title = "Magnitude of Effect of Each Predictor on G3",
       x = "Predictor Variable",
       y = "Absolute Value of Coefficient") +
  theme_minimal()

Use d1 Model on d2 data set

library(Metrics)

# 1. Performance on d1 (Math) — the data the model was fitted on
predictions_d1 <- predict(stepwise.model, newdata = d1)

# RMSE on d1 (math)
rmse_d1 <- rmse(d1$G3, predictions_d1)

# R-squared on d1 (math): 1 - RSS/TSS
rss_d1 <- sum((d1$G3 - predictions_d1)^2)  # Residual Sum of Squares
tss_d1 <- sum((d1$G3 - mean(d1$G3))^2)     # Total Sum of Squares
r_squared_d1 <- 1 - rss_d1 / tss_d1

# 2. Performance on d2 (Portugese) — unseen data for this model
predictions_d2 <- predict(stepwise.model, newdata = d2)

# RMSE on d2
rmse_d2 <- rmse(d2$G3, predictions_d2)

# R-squared on d2
rss_d2 <- sum((d2$G3 - predictions_d2)^2)  # Residual Sum of Squares
tss_d2 <- sum((d2$G3 - mean(d2$G3))^2)     # Total Sum of Squares
r_squared_d2 <- 1 - rss_d2 / tss_d2

# Report in-sample (d1) vs out-of-sample (d2) fit side by side
cat("Performance on d1 (Training Data):\n")
## Performance on d1 (Training Data):
cat("RMSE:", rmse_d1, "\n")
## RMSE: 0.7966354
cat("R-squared:", r_squared_d1, "\n\n")
## R-squared: 0.9389164
cat("Performance on d2 (New Data):\n")
## Performance on d2 (New Data):
cat("RMSE:", rmse_d2, "\n")
## RMSE: 1.235133
cat("R-squared:", r_squared_d2, "\n")
## R-squared: 0.8512134

Exhaustive Search (Extra)

# Exhaustive best-subset search over models with up to 15 predictors
library(leaps)
exh_leaps = regsubsets(G3~., data = d1, nvmax = 15)
summary(exh_leaps)$outmat  # which variables enter the best model of each size
plot(exh_leaps, scale = "Cp")  # lower Mallows' Cp = better

scale = “Cp”: Plots the Mallows’ Cp statistic for each model, where lower values indicate better models. scale = “adjr2”: Plots adjusted R² for each model, where higher values indicate better models. scale = “bic”: Plots the Bayesian Information Criterion (BIC), where lower values indicate better models.

# Alternative all-subsets search with lmSubsets, keeping the 15 best
# models per size, then plotting the deviance/criterion profile
library(lmSubsets)
exh = lmSubsets(G3 ~ ., data = d1, nbest = 15)
plot(exh)